library("ggplot2") 
library("gridExtra")
library("readr")
library("dplyr") 
library("lubridate")
library("janitor")
library("tidyr")
library("tidyverse")
library("DataExplorer")
library("reshape2")
library("data.table")
library("DT")
library("d3heatmap")
library("tigerstats")
library("corrplot")
library("viridis")
library("plotly")
library("tm")
library("RColorBrewer")
library("leaflet")
library("wordcloud")
# Load the first crime extract. Empty cells and the common spellings of
# "missing" are all read as NA; surrounding whitespace is stripped.
raw_crime <- read.csv(
  file = "E:/Masters/Sem 5/comp & visua analystics/b.csv",
  sep = ",",
  na.strings = c('','NA','na','N/A','n/a','NaN','nan'),
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# Keep the raw import untouched; the working copy gets cleaned column names.
df <- clean_names(raw_crime)
#' Summarise a data frame: size, dimensions and per-column missingness.
#'
#' Draws the DataExplorer missing-value plot as a side effect and returns a
#' list of three small data frames.
#'
#' @param x A data frame.
#' @return A list with elements `data.frame` (the expression passed as `x`
#'   and the object's size in Mb), `dimensions` (row and column counts) and
#'   `column.details` (one row per column: name, number of unique values,
#'   missing count and missing percentage).
df_info <- function(x) {
  # Deparse the whole argument expression into one string so that a call
  # such as df_info(d[1:10, ]) still yields exactly one name.
  data <- paste(deparse(substitute(x)), collapse = "")
  size <- format(object.size(x), units = "Mb")

  plot_missing(data.frame(x))

  missing.count <- colSums(is.na(x))
  column.info <- data.frame(
    column        = names(x),
    unique.values = vapply(x, function(y) length(unique(y)), integer(1)),
    missing.count = missing.count,
    missing.pct   = round(missing.count / nrow(x) * 100, 2)
  )
  # seq_len() stays valid when there are no columns (1:0 would be c(1, 0)).
  row.names(column.info) <- seq_len(nrow(column.info))

  list(data.frame     = data.frame(name = data, size = size),
       dimensions     = data.frame(rows = nrow(x), columns = ncol(x)),
       column.details = column.info)
}
Sys.timezone()
## [1] "America/New_York"
# Load the second extract with the same missing-value rules as the first.
df1 <- read.csv(
  file = "E:/Masters/Sem 5/comp & visua analystics/a.csv",
  sep = ",",
  na.strings = c('','NA','na','N/A','n/a','NaN','nan'),
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# Keep the raw import untouched; the working copy gets cleaned column names.
df4 <- clean_names(df1)

# Writing function to get info about our datasets
#' Summarise a data frame: size, dimensions and per-column missingness.
#'
#' Draws the DataExplorer missing-value plot (classic theme) as a side effect
#' and returns a list of three small data frames.
#'
#' @param x A data frame.
#' @return A list with elements `data.frame` (the expression passed as `x`
#'   and the object's size in Mb), `dimensions` (row and column counts) and
#'   `column.details` (one row per column: name, number of unique values,
#'   missing count and missing percentage).
df_info <- function(x) {
  # Deparse the whole argument expression into one string so that a call
  # such as df_info(d[1:10, ]) still yields exactly one name.
  data <- paste(deparse(substitute(x)), collapse = "")
  size <- format(object.size(x), units = "Mb")

  # Visualisation of missing data. The theme is handed to plot_missing()
  # itself: adding it with `+` afterwards only builds a themed copy that is
  # never drawn, because the value is discarded inside this function.
  plot_missing(data.frame(x), ggtheme = theme_classic())

  # Per-column information.
  missing.count <- colSums(is.na(x))
  column.info <- data.frame(
    column        = names(x),
    unique.values = vapply(x, function(y) length(unique(y)), integer(1)),
    missing.count = missing.count,
    missing.pct   = round(missing.count / nrow(x) * 100, 2)
  )
  # seq_len() stays valid when there are no columns (1:0 would be c(1, 0)).
  row.names(column.info) <- seq_len(nrow(column.info))

  list(data.frame     = data.frame(name = data, size = size),
       dimensions     = data.frame(rows = nrow(x), columns = ncol(x)),
       column.details = column.info)
}
Sys.timezone() # Displays the time zone of the current system
## [1] "America/New_York"
# Summarise the df4 dataset: size, dimensions and per-column missingness
df_info(df4)

## $data.frame
##   name    size
## 1  df4 82.1 Mb
## 
## $dimensions
##     rows columns
## 1 365291      17
## 
## $column.details
##                 column unique.values missing.count missing.pct
## 1      incident_number        322293             0        0.00
## 2         offense_code           222             0        0.00
## 3   offense_code_group            67             0        0.00
## 4  offense_description           244             0        0.00
## 5             district            13          1993        0.55
## 6       reporting_area           880         23204        6.35
## 7             shooting             2        363813       99.60
## 8     occurred_on_date        265774             0        0.00
## 9                 year             5             0        0.00
## 10               month            12             0        0.00
## 11         day_of_week             7             0        0.00
## 12                hour            24             0        0.00
## 13            ucr_part             5            99        0.03
## 14              street          4748         11536        3.16
## 15                 lat         18484         23399        6.41
## 16                long         18484         23399        6.41
## 17            location         18499             0        0.00
# Split the day into four 6-hour intervals. The bins are left-closed
# ([0,6), [6,12), [12,18), [18,24]) so that hour 6 lands in "06-12", hour 12
# in "12-18" and hour 18 in "18-24". With cut()'s default right-closed bins
# each boundary hour was counted in the preceding interval, giving buckets
# of 7, 6, 6 and 5 hours.
time_diff <- c(0, 6, 12, 18, 24)
df$time_diff <- cut(df$hour,
                    breaks = time_diff,
                    labels = c("00-06", "06-12", "12-18", "18-24"),
                    right = FALSE,
                    include.lowest = TRUE)
table(df$time_diff)
## NOTE: the counts below were recorded with the earlier right-closed bins;
## re-run table(df$time_diff) to refresh them.
## 
## 00-06 06-12 12-18 18-24 
## 38643 74361 92128 58697
# Derive a named shift for every incident from its 6-hour time bucket.
# Rows without a time bucket keep a missing shift.
df <- df %>%
  mutate(shift = case_when(
    time_diff == "00-06" ~ "Late Night",
    time_diff == "06-12" ~ "Morning",
    time_diff == "12-18" ~ "Day",
    !is.na(time_diff) ~ "Evening"
  ))
# Share of incidents falling in each shift.
x <- table(df$shift)
prop.table(x)
## 
##        Day    Evening Late Night    Morning 
##  0.3491959  0.2224812  0.1464699  0.2818530
# Interactive histogram of incident counts per offense category, coloured by
# shift.
plot_crime_offense_category <- plot_ly(
  df,
  x = ~offense_code_group,
  color = ~shift
) %>%
  add_histogram() %>%
  layout(
    title = "Total crime count distributed by hour",
    # xaxis and yaxis must be sibling arguments of layout(); the y-axis
    # settings used to sit inside the xaxis list, where they had no effect
    # on the y axis.
    xaxis = list(title = "crime"),
    yaxis = list(title = "Count")
    # marker = list(color = colorRampPalette(brewer.pal(11, "Spectral"))(100))
  )
plot_crime_offense_category
# Incidents reported before 2018.
ca_crime_df <- df[which(as.numeric(df$year) < 2018), ]

# Horizontal bar chart of incident counts per district (rows with a missing
# district are dropped). reorder() sorts the bars by count, so no separate
# arrange() step is needed.
ca_crime_df %>%
  filter(!is.na(district)) %>%
  group_by(district) %>%
  # n() takes no arguments; a stray `na.rm = TRUE` here would only add a
  # constant column named `na.rm` to the summary.
  summarise(count = n()) %>%
  mutate(district = reorder(district, count)) %>%
  ggplot(aes(x = district, y = count)) +
  geom_bar(stat = "identity", color = "white", fill = "skyblue") +
  geom_text(aes(label = count),
            hjust = 1, vjust = .5, size = 3, color = 'black',
            fontface = 'italic') +
  labs(x = "Neighborhood", y = "count",
       title = "Total crime in Each Neighborhood in 2016 & 2017 ") +
  coord_flip() +
  theme_classic()

#df_2018 <- df[which(as.numeric(df$year) == 2018), ]
# Vertical bar chart of incident counts per district over the whole dataset
# (rows with a missing district are dropped). reorder() sorts the bars by
# count, so no separate arrange() step is needed.
df %>%
  filter(!is.na(district)) %>%
  group_by(district) %>%
  # n() takes no arguments; a stray `na.rm = TRUE` here would only add a
  # constant column named `na.rm` to the summary.
  summarise(count = n()) %>%
  mutate(district = reorder(district, count)) %>%
  ggplot(aes(x = district, y = count)) +
  geom_bar(stat = "identity", color = "white", fill = "skyblue") +
  geom_text(aes(label = count),
            hjust = .5, vjust = 0, size = 3, color = 'black',
            fontface = 'italic') +
  labs(x = "Neighborhood", y = "Count",
       title = "Total crime in each Neighboorhood for 2016 - 2018") +
  # Rotate the district names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Horizontal bar chart of the ten most frequent offense groups.
df %>%
  filter(!is.na(offense_code_group)) %>%
  group_by(offense_code_group) %>%
  # n() takes no arguments; a stray `na.rm = TRUE` here would only add a
  # constant column named `na.rm` to the summary.
  summarise(count = n()) %>%
  # Sort first so that head(10) keeps the ten largest groups.
  arrange(desc(count)) %>%
  mutate(offense_code_group = reorder(offense_code_group, count)) %>%
  head(10) %>%
  ggplot(aes(x = offense_code_group, y = count, fill = offense_code_group)) +
  geom_bar(stat = "identity", color = "white") +
  geom_text(aes(label = paste0("  ", count)),
            hjust = 1, vjust = .5, size = 4, color = 'black',
            fontface = 'italic') +
  labs(x = "Crime", y = "Count", title = "Top crime in Boston distibuted") +
  coord_flip() +
  theme_classic()

# length(unique(df$crime)) # We have 54 different types of crime which are reported.
# Incident counts per offense group for the pre-2018 subset, most frequent
# first; offense_code_group becomes a factor ordered by count.
y <- ca_crime_df %>%
  filter(!is.na(offense_code_group)) %>%
  group_by(offense_code_group) %>%
  # n() takes no arguments; a stray `na.rm = TRUE` here would only add a
  # constant column named `na.rm` to the summary.
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  ungroup() %>%
  mutate(offense_code_group = reorder(offense_code_group, count))
# Incident counts per year; year becomes a factor ordered by count.
z <- df %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  # n() takes no arguments; a stray `na.rm = TRUE` here would only add a
  # constant column named `na.rm` to the summary.
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  ungroup() %>%
  mutate(year = reorder(year, count))

# Horizontal bar chart of the yearly totals. This is a separate statement,
# not a continuation of the pipeline above.
ggplot(z, aes(x = year, y = count)) +
  geom_bar(stat = "identity", color = "blue", fill = "Lightblue",
           width = 0.25) +
  geom_text(aes(label = paste0(" ", count)),
            hjust = 1, vjust = .25, size = 4, color = 'black',
            fontface = 'italic') +
  # The x aesthetic is the year, so label it as such.
  labs(x = "Year", y = "Count",
       title = "Total crime in Boston from year 2016-2018 ") +
  coord_flip() +
  theme_classic()

# Number of incidents per month, one line per district (rows with a missing
# district are dropped).
ggplot(subset(df, !is.na(district)), aes(x = month, color = district)) +
  geom_line(stat = "count") +
  scale_x_continuous(breaks = seq(1, 12, 1)) +
  # The x aesthetic is the month, so label it as such.
  labs(title = "No. Incidents by Neighborhood on Monthly Basis",
       x = "Month", y = "Number of Incidents") +
  theme_classic()

# Area chart of the number of incidents per hour of the day.
# `width` is not a geom_area() parameter, so it is not passed here.
ggplot(df, aes(x = hour)) +
  geom_area(stat = "count") +
  labs(title = "Reported Crime start time (in Hours)",
       x = "Hour (Format - 24Hrs)", y = "Number of Count") +
  theme_classic()

# Monthly incident counts per offense group for the pre-2018 subset.
counts <- ca_crime_df %>%
  group_by(offense_code_group, month) %>%
  summarise(Counts = n()) %>%
  ungroup() %>%
  arrange(month)

# Reshape to one row per month and one column per offense group.
# reshape2::dcast is named explicitly because data.table is attached after
# reshape2 and masks dcast() with its own generic.
crime_plot <- reshape2::dcast(counts, month ~ offense_code_group,
                              value.var = "Counts")
# Month/offense combinations that never occur count as zero incidents.
crime_plot[is.na(crime_plot)] <- 0
row.names(crime_plot) <- crime_plot$month # Make month the row names
crime_plot <- crime_plot[, -1]            # Drop the month column

# Pairwise correlation between the monthly profiles of the offense groups.
crime_plot <- cor(crime_plot)
# corrplot() draws with base graphics, so no ggplot2 theme is added to it.
corrplot(crime_plot, type = "lower", order = "hclust", method = "color",
         tl.col = "black", tl.srt = 45, number.cex = 0.60, tl.cex = 0.50)
# What are the top crimes in each district?
# Count incidents per district/offense pair in the pre-2018 subset, then
# keep the most frequent offense group(s) of each district (ties are kept).
district_by_crime <- ca_crime_df %>%
  group_by(district, offense_code_group) %>%
  dplyr::summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  # Name the ranking column explicitly instead of letting top_n() fall back
  # to the last column of the table.
  top_n(n = 1, wt = Total)
head(district_by_crime, 10)
## # A tibble: 10 x 3
## # Groups:   district [10]
##    district      offense_code_group              Total
##    <chr>         <chr>                           <int>
##  1 South End     Larceny                          4113
##  2 Roxbury       Motor Vehicle Accident Response  3956
##  3 Dorchester    Motor Vehicle Accident Response  3250
##  4 Downtown      Larceny                          2621
##  5 Mattapan      Motor Vehicle Accident Response  2382
##  6 Brighton      Motor Vehicle Accident Response  1794
##  7 South Boston  Motor Vehicle Accident Response  1633
##  8 Hyde Park     Motor Vehicle Accident Response  1431
##  9 Jamaica Plain Motor Vehicle Accident Response  1326
## 10 West Roxbury  Motor Vehicle Accident Response  1142
# Turn the 2016-2017 per-district summary into a plain data frame whose
# district and offense columns are factors.
district_by_crime <- district_by_crime %>%
  as.data.frame() %>%
  mutate(
    district = factor(district),
    offense_code_group = factor(offense_code_group)
  )

# Horizontal bars, one per district ordered by total, filled by the
# district's top offense group and annotated with the count.
ggplot(
  district_by_crime,
  aes(x = reorder(district, Total), y = Total, fill = offense_code_group)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(x = district, label = paste0(" ", Total)),
    hjust = 1, vjust = .25, size = 4, color = 'black', fontface = 'italic'
  ) +
  scale_fill_discrete(name = "Offense Category") +
  labs(
    title = "Top Crime in each district in 2016-2017",
    x = "district",
    y = "Total Count"
  ) +
  coord_flip() +
  theme_classic()

# Same summary for 2018 only: count incidents per district/offense pair and
# keep the most frequent offense group(s) of each district (ties are kept).
district_by_crime_2018 <- df %>%
  filter(year == 2018) %>%
  group_by(district, offense_code_group) %>%
  dplyr::summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  # Name the ranking column explicitly instead of letting top_n() fall back
  # to the last column of the table.
  top_n(n = 1, wt = Total)
# Turn the 2018 per-district summary into a plain data frame whose district
# and offense columns are factors.
district_by_crime_2018 <- district_by_crime_2018 %>%
  as.data.frame() %>%
  mutate(
    district = factor(district),
    offense_code_group = factor(offense_code_group)
  )

# Horizontal bars, one per district ordered by total, filled by the
# district's top offense group and annotated with the count.
ggplot(
  district_by_crime_2018,
  aes(x = reorder(district, Total), y = Total, fill = offense_code_group)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(x = district, label = paste0(" ", Total)),
    hjust = 1, vjust = .25, size = 3, color = 'black', fontface = 'italic'
  ) +
  scale_fill_discrete(name = "Offense Category") +
  labs(
    title = "Top Crime in each district 2018",
    x = "District",
    y = "Total Count"
  ) +
  coord_flip() +
  theme_classic()